{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# This script is used for the online course \"资金流入流出\" in Tianchi platform\n",
    "\n",
    "                                                                           Author: Chuan-yu Xue\n",
    "                                                                           Email: cs_xcy@126.com"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import sklearn as skr\n",
    "import numpy as np\n",
    "import datetime\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from dateutil.relativedelta import relativedelta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the balance data\n",
    "def load_data(path: str = 'user_balance_table.csv')->pd.DataFrame:\n",
    "    data_balance = pd.read_csv(path)\n",
    "    data_balance = add_timestamp(data_balance)\n",
    "    return data_balance.reset_index(drop=True)\n",
    "    \n",
    "\n",
    "# add tiemstamp to dataset\n",
    "def add_timestamp(data: pd.DataFrame, time_index: str = 'report_date')->pd.DataFrame:\n",
    "    data_balance = data.copy()\n",
    "    data_balance['date'] = pd.to_datetime(data_balance[time_index], format= \"%Y%m%d\")\n",
    "    data_balance['day'] = data_balance['date'].dt.day\n",
    "    data_balance['month'] = data_balance['date'].dt.month\n",
    "    data_balance['year'] = data_balance['date'].dt.year\n",
    "    data_balance['week'] = data_balance['date'].dt.week\n",
    "    data_balance['weekday'] = data_balance['date'].dt.weekday\n",
    "    return data_balance.reset_index(drop=True)\n",
    "\n",
    "# total amount\n",
    "def get_total_balance(data: pd.DataFrame, date: str = '2014-03-31')->pd.DataFrame:\n",
    "    df_tmp = data.copy()\n",
    "    df_tmp = df_tmp.groupby(['date'])['total_purchase_amt','total_redeem_amt'].sum()\n",
    "    df_tmp.reset_index(inplace=True)\n",
    "    return df_tmp[(df_tmp['date']>= date)].reset_index(drop=True)\n",
    "\n",
    "# Generate the test data\n",
    "def generate_test_data(data: pd.DataFrame)->pd.DataFrame:\n",
    "    total_balance = data.copy()\n",
    "    start = datetime.datetime(2014,9,1)\n",
    "    testdata = []\n",
    "    while start != datetime.datetime(2014,10,15):\n",
    "        temp = [start, np.nan, np.nan]\n",
    "        testdata.append(temp)\n",
    "        start += datetime.timedelta(days = 1)\n",
    "    testdata = pd.DataFrame(testdata)\n",
    "    testdata.columns = total_balance.columns\n",
    "\n",
    "    total_balance = pd.concat([total_balance, testdata], axis = 0)\n",
    "    total_balance = total_balance.reset_index(drop=True)\n",
    "    return total_balance.reset_index(drop=True)\n",
    "\n",
    "# Load user's information\n",
    "def load_user_information(path: str = 'user_profile_table.csv')->pd.DataFrame:\n",
    "    return pd.read_csv(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 载入数据\n",
    "\n",
    "balance_data = load_data('Dataset/user_balance_table.csv')\n",
    "balance_data = add_timestamp(balance_data)\n",
    "total_balance = get_total_balance(balance_data, date = '2014-03-01')\n",
    "total_balance = generate_test_data(total_balance)\n",
    "total_balance = add_timestamp(total_balance, 'date')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 创建数据的深层拷贝\n",
    "\n",
    "data = total_balance.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义生成时间序列规则预测结果的方法\n",
    "\n",
    "def generate_base(df: pd.DataFrame, month_index: int)->pd.DataFrame:\n",
    "    # 选中固定时间段的数据集\n",
    "    total_balance = df.copy()\n",
    "    total_balance = total_balance[['date','total_purchase_amt','total_redeem_amt']]\n",
    "    total_balance = total_balance[(total_balance['date'] >= datetime.date(2014,3,1)) & (total_balance['date'] < datetime.date(2014, month_index, 1))]\n",
    "\n",
    "    # 加入时间戳\n",
    "    total_balance['weekday'] = total_balance['date'].dt.weekday\n",
    "    total_balance['day'] = total_balance['date'].dt.day\n",
    "    total_balance['week'] = total_balance['date'].dt.week\n",
    "    total_balance['month'] = total_balance['date'].dt.month\n",
    "    \n",
    "    # 统计翌日因子\n",
    "    mean_of_each_weekday = total_balance[['weekday']+['total_purchase_amt','total_redeem_amt']].groupby('weekday',as_index=False).mean()\n",
    "    for name in ['total_purchase_amt','total_redeem_amt']:\n",
    "        mean_of_each_weekday = mean_of_each_weekday.rename(columns={name: name+'_weekdaymean'})\n",
    "    mean_of_each_weekday['total_purchase_amt_weekdaymean'] /= np.mean(total_balance['total_purchase_amt'])\n",
    "    mean_of_each_weekday['total_redeem_amt_weekdaymean'] /= np.mean(total_balance['total_redeem_amt'])\n",
    "\n",
    "    # 合并统计结果到原数据集\n",
    "    total_balance = pd.merge(total_balance, mean_of_each_weekday, on='weekday', how='left')\n",
    "\n",
    "    # 分别统计翌日在(1~31)号出现的频次\n",
    "    weekday_count = total_balance[['day','weekday','date']].groupby(['day','weekday'],as_index=False).count()\n",
    "    weekday_count = pd.merge(weekday_count, mean_of_each_weekday, on='weekday')\n",
    "\n",
    "    # 依据频次对翌日因子进行加权，获得日期因子\n",
    "    weekday_count['total_purchase_amt_weekdaymean'] *= weekday_count['date']   / len(np.unique(total_balance['month']))\n",
    "    weekday_count['total_redeem_amt_weekdaymean'] *= weekday_count['date']  / len(np.unique(total_balance['month']))\n",
    "    day_rate = weekday_count.drop(['weekday','date'],axis=1).groupby('day',as_index=False).sum()\n",
    "\n",
    "    # 将训练集中所有日期的均值剔除日期残差得到base\n",
    "    day_mean = total_balance[['day'] + ['total_purchase_amt','total_redeem_amt']].groupby('day',as_index=False).mean()\n",
    "    day_pre = pd.merge(day_mean, day_rate, on='day', how='left')\n",
    "    day_pre['total_purchase_amt'] /= day_pre['total_purchase_amt_weekdaymean']\n",
    "    day_pre['total_redeem_amt'] /= day_pre['total_redeem_amt_weekdaymean']\n",
    "\n",
    "    # 生成测试集数据\n",
    "    for index, row in day_pre.iterrows():\n",
    "        if month_index in (2,4,6,9) and row['day'] == 31:\n",
    "            break\n",
    "        day_pre.loc[index, 'date'] = datetime.datetime(2014, month_index, int(row['day']))\n",
    "\n",
    "    # 基于base与翌日因子获得最后的预测结果\n",
    "    day_pre['weekday'] = day_pre.date.dt.weekday\n",
    "    day_pre = day_pre[['date','weekday']+['total_purchase_amt','total_redeem_amt']]\n",
    "    day_pre = pd.merge(day_pre, mean_of_each_weekday,on='weekday')\n",
    "    day_pre['total_purchase_amt'] *= day_pre['total_purchase_amt_weekdaymean']\n",
    "    day_pre['total_redeem_amt'] *= day_pre['total_redeem_amt_weekdaymean']\n",
    "\n",
    "    day_pre = day_pre.sort_values('date')[['date']+['total_purchase_amt','total_redeem_amt']]\n",
    "    return day_pre"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:7: FutureWarning: Comparing Series of datetimes with 'datetime.date'.  Currently, the\n",
      "'datetime.date' is coerced to a datetime. In the future pandas will\n",
      "not coerce, and a TypeError will be raised. To retain the current\n",
      "behavior, convert the 'datetime.date' to a datetime with\n",
      "'pd.Timestamp'.\n",
      "  import sys\n"
     ]
    }
   ],
   "source": [
    "# 生成预测结果（以及残差）\n",
    "\n",
    "base_list = []\n",
    "for i in range(4, 10):\n",
    "    base_list.append(generate_base(data, i).reset_index(drop=True))\n",
    "\n",
    "base = pd.concat(base_list).reset_index(drop=True)\n",
    "for i in ['total_purchase_amt','total_redeem_amt']:\n",
    "    base = base.rename(columns={i: i+'_base'})\n",
    "\n",
    "data = pd.merge(data.reset_index(drop=True), base.reset_index(drop=True), on='date', how='left').reset_index(drop=True)\n",
    "\n",
    "data['purchase_residual'] = data['total_purchase_amt'] / data['total_purchase_amt_base']\n",
    "\n",
    "data['redeem_residual'] = data['total_redeem_amt'] / data['total_redeem_amt_base']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对结果表重命名\n",
    "\n",
    "data = data[['date','purchase_residual','redeem_residual','total_purchase_amt_base', 'total_redeem_amt_base']]\n",
    "for i in data.columns:\n",
    "    if i == 'date':\n",
    "        data[i] = data[i].astype(str)\n",
    "        data[i] = data[i].str.replace('-','')\n",
    "data.columns = [['date'] + ['total_purchase_amt','total_redeem_amt'] + ['total_purchase_predicted_by_cycle','total_redeem_predicted_by_cycle'] ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存预测结果到本地\n",
    "\n",
    "data.to_csv('Dataset/base.csv',index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
